Reproducible Bankruptcy Classifier Analysis

Libraries

library(h2o)
library(tidyverse)
library(plotly)

# Not in container
library(recipes)
library(embed)

Data

Bankruptcy Classification Features

# Read the prepared modelling table from disk and preview it: one `class`
# factor column plus the numeric Attr* feature columns.
data_prepared_tbl <- readr::read_rds("00_data/data_prepared_tbl.rds")
print(data_prepared_tbl)
## # A tibble: 4,998 x 64
##    class    Attr1 Attr2  Attr3 Attr4 Attr5  Attr6   Attr7 Attr8 Attr9
##    <fct>    <dbl> <dbl>  <dbl> <dbl> <dbl>  <dbl>   <dbl> <dbl> <dbl>
##  1 0      0.0882  0.555 0.0113  1.02 -66.5 0.342  0.109   0.578  1.09
##  2 0      0.130   0.221 0.578   3.61 120.  0.188  0.162   3.06   1.14
##  3 0      0.0482  0.550 0.108   1.24 -23.0 0      0.0593  0.817  1.52
##  4 0      0.0995  0.600 0.375   1.65  19.0 0.211  0.124   0.667  1.10
##  5 0      0.0785  0.205 0.104   2.79  77.8 0.365  0.0934  3.87   1.23
##  6 0      0.125   0.354 0.314   2.71  17.9 0.306  0.158   1.82   1.24
##  7 0      0.185   0.340 0.383   2.13  54.4 0.630  0.231   1.84   1.15
##  8 0      0.0905  0.314 0.425   3.21  24.9 0.0557 0.105   2.15   1.05
##  9 0     -0.00213 0.251 0.351   2.48  31.9 0.124  0.00590 2.34   1.06
## 10 0      0.136   0.296 0.477   2.61  70.9 0.414  0.169   2.34   1.16
## # … with 4,988 more rows, and 54 more variables: Attr10 <dbl>,
## #   Attr11 <dbl>, Attr12 <dbl>, Attr13 <dbl>, Attr14 <dbl>, Attr15 <dbl>,
## #   Attr16 <dbl>, Attr17 <dbl>, Attr18 <dbl>, Attr19 <dbl>, Attr20 <dbl>,
## #   Attr21 <dbl>, Attr22 <dbl>, Attr23 <dbl>, Attr24 <dbl>, Attr25 <dbl>,
## #   Attr26 <dbl>, Attr27 <dbl>, Attr28 <dbl>, Attr29 <dbl>, Attr30 <dbl>,
## #   Attr31 <dbl>, Attr32 <dbl>, Attr33 <dbl>, Attr34 <dbl>, Attr35 <dbl>,
## #   Attr36 <dbl>, Attr38 <dbl>, Attr39 <dbl>, Attr40 <dbl>, Attr41 <dbl>,
## #   Attr42 <dbl>, Attr43 <dbl>, Attr44 <dbl>, Attr45 <dbl>, Attr46 <dbl>,
## #   Attr47 <dbl>, Attr48 <dbl>, Attr49 <dbl>, Attr50 <dbl>, Attr51 <dbl>,
## #   Attr52 <dbl>, Attr53 <dbl>, Attr54 <dbl>, Attr55 <dbl>, Attr56 <dbl>,
## #   Attr57 <dbl>, Attr58 <dbl>, Attr59 <dbl>, Attr60 <dbl>, Attr61 <dbl>,
## #   Attr62 <dbl>, Attr63 <dbl>, Attr64 <dbl>

Bankruptcy Classification Feature Descriptions

# Load the raw data dictionary. Its `Attribute.Information:` column holds one
# "<id> <description>" string per feature.
data_dictionary_raw_tbl <- read_rds("00_data/data_dictionary_raw_tbl.rds")

# Split each entry at the first space into the feature id and its description
# (extra = "merge" keeps the remaining words together in `desc`), then rename
# the id prefix from "X" to "Attr" so the ids line up with the Attr* column
# names used in data_prepared_tbl.
data_dictionary_tbl <- data_dictionary_raw_tbl %>%
    separate(
        `Attribute.Information:`,
        into = c("id", "desc"),
        sep = " ",
        extra = "merge"
    ) %>%
    # Anchor the pattern so that only a leading "X" (the prefix) is rewritten,
    # never an "X" that happens to occur elsewhere in an id.
    mutate(id = str_replace(id, "^X", "Attr"))

data_dictionary_tbl
## # A tibble: 64 x 2
##    id     desc                                                             
##    <chr>  <chr>                                                            
##  1 Attr1  net profit / total assets                                        
##  2 Attr2  total liabilities / total assets                                 
##  3 Attr3  working capital / total assets                                   
##  4 Attr4  current assets / short-term liabilities                          
##  5 Attr5  [(cash + short-term securities + receivables - short-term liabil…
##  6 Attr6  retained earnings / total assets                                 
##  7 Attr7  EBIT / total assets                                              
##  8 Attr8  book value of equity / total liabilities                         
##  9 Attr9  sales / total assets                                             
## 10 Attr10 equity / total assets                                            
## # … with 54 more rows

H2O Prediction Analysis

# Connect this R session to an H2O cluster. The h2o.loadModel(), as.h2o() and
# h2o.predict() calls further down all run against this connection.
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 minutes 53 seconds 
##     H2O cluster timezone:       Etc/UTC 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.30.0.1 
##     H2O cluster version age:    4 months and 1 day !!! 
##     H2O cluster name:           H2O_started_from_R_rstudio_cyf940 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.61 GB 
##     H2O cluster total cores:    6 
##     H2O cluster allowed cores:  6 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, XGBoost, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)

Load Production Model

# Build the location of the saved production model relative to the RStudio
# project root, so the path does not depend on the current working directory.
path <- rprojroot::find_rstudio_root_file() %>%
    file.path("00_production_model/PROD_H2O_MODEL")

# Restore the saved model into the running H2O cluster.
h2o_model <- h2o.loadModel(path)

Make Predictions

# Upload the prepared data to H2O, score it with the production model, and
# bring the predictions back into R as a tibble.
predictions_tbl <- as_tibble(
    h2o.predict(h2o_model, newdata = as.h2o(data_prepared_tbl))
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Preview the predictions: the predicted class (`predict`) and the per-class
# probabilities `p0` and `p1` (`p1` is shown later as the bankruptcy probability).
predictions_tbl
## # A tibble: 4,998 x 3
##    predict    p0      p1
##    <fct>   <dbl>   <dbl>
##  1 0       0.994 0.00584
##  2 0       0.998 0.00155
##  3 0       0.988 0.0119 
##  4 0       0.994 0.00636
##  5 0       0.997 0.00292
##  6 0       0.997 0.00306
##  7 0       0.998 0.00215
##  8 0       0.998 0.00244
##  9 0       0.998 0.00237
## 10 0       0.997 0.00296
## # … with 4,988 more rows

Bankruptcy UMAP Visualization

Apply UMAP

# Preprocessing recipe, built step by step:
#   1. `class` is the outcome; every other column is a predictor.
#   2. Normalize all Attr* columns.
#   3. Project the Attr* columns onto 3 UMAP components, passing `class` as
#      the outcome; fixed seeds keep the embedding reproducible.
recipe_spec <- recipe(class ~ ., data = data_prepared_tbl)
recipe_spec <- step_normalize(recipe_spec, contains("Attr"))
recipe_spec <- step_umap(
    recipe_spec,
    contains("Attr"),
    outcome  = vars(class),
    num_comp = 3,
    seed     = c(123, 123)
)

# Estimate the recipe on its own data and extract the processed training set
# (class + umap_1..umap_3).
umap_data_tbl <- juice(prep(recipe_spec))
umap_data_tbl
## # A tibble: 4,998 x 4
##    class  umap_1 umap_2  umap_3
##    <fct>   <dbl>  <dbl>   <dbl>
##  1 0     -1.53    2.79   0.761 
##  2 0      1.25    1.02   1.14  
##  3 0     -1.09    2.77  -0.0291
##  4 0     -1.10    0.510  1.52  
##  5 0     -0.320   2.30   0.685 
##  6 0      0.268   2.04   0.949 
##  7 0      0.862   1.52   1.23  
##  8 0      0.815   0.740  2.20  
##  9 0      0.0856  0.469  1.47  
## 10 0      0.636   1.41   1.04  
## # … with 4,988 more rows

Plotly Visualization

# Hover text ----
# Column-bind (by row position) the UMAP coordinates, a row-number company id
# with four selected ratios, and the model predictions; then assemble one
# multi-line hover label per company.
plot_data_tbl <- bind_cols(
    umap_data_tbl,
    select(
        rowid_to_column(data_prepared_tbl, var = ".id"),
        .id, Attr39, Attr56, Attr26, Attr22
    ),
    predictions_tbl
) %>%
    mutate(tooltip = str_glue(
        "
        Company ID: {.id}
        Class: {class}
        Bankruptcy Probability: {scales::percent(p1, accuracy = 0.1)}
        Attr 39 Profit on Sales / Sales: {Attr39}
        Attr 26 (net profit + depreciation) / total liabilities: {Attr26}
        Attr 22 profit on operating activities / total assets: {Attr22}
        Attr56 (sales - cost of products sold) / sales: {Attr56}
        "
    ))

# 3D scatter ----
# Plot the three UMAP components, coloured by class, with the prebuilt
# tooltip column as the hover label.
plot_ly(
    data          = plot_data_tbl,
    x             = ~ umap_1,
    y             = ~ umap_2,
    z             = ~ umap_3,
    color         = ~ class,
    colors        = c("#BF382A", "#0C4B8E"),
    hovertemplate = ~ tooltip
) %>%
    add_markers(opacity = 0.5)